This is the analysis on Basal Monocytes - PD vs Control (MyND cohort)
samples. There are 301 Control samples, and 390 PD samples, in total of
691 sample.
This markdown is consisted of three parts -
For two threshold - ( just adj.P value and adj.P value + logFC ),
three analysis were performed. * Limma Gene Differential
* DEGenes (Differential Expressed Genes) and DESites (Differential
Edited RNA editing sites) from previous Limma editing differential
analysis)‘s comparison
* DEGenes’ pathway and DESites pathway’s - comparison
Genes that have mean TPM > 1 were selected to be analysed
(filtering by TPM value, and using that to filter the count
matrix)
Both Control and PD samples are distributed across three batches_ 691
samples (PD and Control) out of 811 MyND samples ( all disease )
#load("/Users/hyominseo/Desktop/RAJ/Gene_Differential/Basal/raw_data/MyND_833_gene_matrix.RData")
# Metadata for MyND
mynd_meta<-read_tsv("Basal/Basal_tsv/MyND_all_sample.tsv")%>%
column_to_rownames("sample")
mynd_meta$sex[mynd_meta$sex == "Male"] <- "M"
mynd_meta$sex[mynd_meta$sex == "Female"] <- "F"
# 833->811 samples that match meta data
mynd_tpm<-all_genes_tpm_filt%>%
dplyr::filter(rownames(.) %in% rownames(mynd_meta))
mynd_tpm<-as.data.frame(t(mynd_tpm))
# dplyr::filtering by Mean Gene TPM ( 58929 -> 14405)
mynd_tpm<-mynd_tpm%>%dplyr::filter(rowMeans(.)>1)
# 833->811dplyr::filtering count by metadata sample
mynd_count<-all_genes_counts_filt%>% dplyr::filter(rownames(.) %in% rownames(mynd_meta))
# count matrix has rownames of GENEID and colnames of SAMPLE
mynd_count<-as.data.frame(t(mynd_count))%>%dplyr::filter(rownames(.) %in% rownames(mynd_tpm))
# Normalize
mynd_norm<-calcNormFactors(mynd_count,method = "TMM")
filepath<-file.path("/Users/hyominseo/Desktop/RAJ/Gene_Differential/Basal/")
save( mynd_count,mynd_tpm,mynd_meta,mynd_norm, file = file.path(filepath, "MyND_811_gene_data.RData"))load("/Users/hyominseo/Desktop/RAJ/Gene_Differential/Basal/MyND_811_gene_data.RData")
all_gene<-as.data.frame(rownames(mynd_count))%>%
dplyr::rename("GENEID"="rownames(mynd_count)")
gencode<-read_tsv("gencode.v30.tx2gene.tsv")%>%
distinct(GENEID, .keep_all=TRUE)%>%dplyr::select(-c("TXNAME"))
all_gene_name<-merge(all_gene,gencode, by =
"GENEID")%>%dplyr::select(GENENAME)
filepath<-file.path("/Users/hyominseo/Desktop/RAJ/Gene_Differential/Basal/Basal_tsv")
write.table(all_gene_name, file = file.path(filepath,"PD_Control_All_Gene_Names.tsv"),
row.names = F, sep = "\t", quote = F)load("/Users/hyominseo/Desktop/RAJ/Gene_Differential/Basal/MyND_811_gene_data.RData")
# All Batch - PD vs Control
support<-mynd_meta%>%dplyr::filter(grepl("Control|PD",disease))%>%
dplyr::select(c('batch','disease','sex','age'))
writeLines("td, th { padding : 6px } th { background-color : blue ; color : white; border : 1px solid white; } td { color : blue ; border : 1px solid blue }", con = "mystyle.css")
knitr::kable(table(support$batch,support$disease), format = "simple", table.attr = "style='width:80%;'",caption = "Basal Monocytes 691 Batch Distribution")| Control | PD | |
|---|---|---|
| batch_1 | 84 | 126 |
| batch_2 | 200 | 143 |
| batch_3 | 17 | 121 |
string<-"Genes that have tpm>1 in basal monocytes for PD and Control sample: 58929 ->"
number<-as.integer(nrow(mynd_count))
print(paste(string,number))## [1] "Genes that have tpm>1 in basal monocytes for PD and Control sample: 58929 -> 14405"
reference: https://ucdavis-bioinformatics-training.github.io/2018-June-RNA-Seq-Workshop/thursday/DE.html
reference:
text=limma%20is%20an%20R%20package,analyses%20of%20RNA%2DSeq%20data.
Makes PD_Control.tsv which is the entire Limma gene differential
toptable for all 14405 genes
The filtered gene count matrix is normalized using
calcNormfactros.
Covariate ‘Batch’ encompasses all technical variables
diff_PD<-function(support_in,title ){
count_df<-as.data.frame(t(mynd_count))%>%
dplyr::filter(rownames(.)%in%rownames(support_in))
count_df<-as.data.frame(t(count_df))
# Normalizing count matrix
count_norm<-calcNormFactors(count_df, method="TMM")
disease<-as.factor(support_in$disease)
sex<-as.factor(support_in$sex)
age<-support_in$age
batch<-as.factor(support_in$batch)
model<-model.matrix(~disease+sex+age+batch)
dge<-DGEList(counts=count_df, samples = support_in, norm.factors= count_norm)
v <- voom(dge, model)
vfit<-lmFit(v,model)
efit<-eBayes(vfit)
print(summary(decideTests(efit, p.value = "0.05")))
DE_genes<-topTable(efit, sort.by = "p",n=Inf, coef = 2)
DE_genes$GENEID<-rownames(DE_genes)
DE_genes <- DE_genes[,c("GENEID", names(DE_genes)[1:6])]
gencode<-read_tsv("gencode.v30.tx2gene.tsv")%>%
distinct(GENEID, .keep_all=TRUE)%>%dplyr::select(-c("TXNAME"))
DE_genes_ID<-merge(DE_genes,gencode, by = "GENEID")
filepath<-file.path("/Users/hyominseo/Desktop/RAJ/Gene_Differential/Basal/TopTable")
write.table(DE_genes_ID, file = file.path(filepath,title),
row.names = F, sep = "\t", quote = F)
}
#diff_PD(support, "PD_Control.tsv")Only PD vs Control, other trials (AD,AD+MCI, MCI vs Basal) have no significant count.
vol_plot<-function(table_in, point_size, text_size,title_text_size,plot_title,label_size){
t<-read_tsv(table_in)
t$DE_genes<-case_when(
t$adj.P.Val < 0.05 & abs(t$logFC) > 1 ~ "TRUE",
t$adj.P.Val > 0.05 & abs(t$logFC) < 1 ~ "FALSE"
)
# Adding UP and Down regulated gene count
t$DE_direction<-case_when(
t$DE_genes == "TRUE" & t$logFC > 0.00 ~ "UP",
t$DE_genes == "TRUE" & t$logFC < 0.00 ~ "DOWN"
)
highlight_df<-t%>%dplyr::filter(grepl("ADAR.*|APOBEC.*", GENENAME))
vol_plot<- ggplot(t, aes(x=logFC, y=-log10(P.Value), col = DE_direction))+
geom_point(size =point_size, alpha=0.7)+
theme_bw()+
scale_color_manual(values = wes_palette(n=3,name="GrandBudapest1"))+
geom_vline(xintercept=c(-1,1), col ="black",linetype="longdash")+
labs(title=plot_title)+
theme(plot.title = element_text(size=title_text_size, face="bold", hjust=0.5))+
theme(text = element_text(size = text_size)) +
theme(legend.position='none') +
geom_text_repel(data = highlight_df, aes(label = GENENAME),
box.padding = 0, max.overlaps = Inf,
point.padding = 0,segment.size = 0.2,
segment.color = "black",point.size =3,
fontface="bold",hjust =0,
color="black",size = label_size)+
geom_point(data = highlight_df, size =point_size, color = "black")
vol_plot
}
# Extracting just significantly DE genes from the entire TopTable
# Makes ~_DEG.tsv
sig_genes<-function(toptable){
df<-read_tsv(toptable)
df$DE_genes<-case_when(
# defining sig
df$adj.P.Val < 0.05 & abs(df$logFC) > 1 ~ "TRUE",
df$adj.P.Val > 0.05 & abs(df$logFC) < 1 ~ "FALSE"
)
df<-df%>%dplyr::filter(!grepl("FALSE", DE_genes))
df<-df%>%drop_na(DE_genes)
df$DE_direction<-case_when(
df$logFC > 0.00 ~ "UP",
df$logFC < 0.00 ~ "DOWN"
)
df
}
# pd_DEG<-sig_genes("MyND/TopTable/PD_Control.tsv")
## pd_DEG$GENEID<-gsub("\\..*","",pd_DEG$GENEID)
# filepath<-file.path("/Users/hyominseo/Desktop/RAJ/Gene_Differential/Basal/TopTable")
# write.table(pd_DEG, file = file.path(filepath,"PD_Control_DEG.tsv"), row.names = F, sep = "\t", quote = F)PD vs Control DEGenes Volcano Plot
PD_Control_DEG<- read_tsv("Basal/TopTable/PD_Control_P_Gene.tsv")
DEG_UP_Count<-c( as.integer(length(which(PD_Control_DEG$DE_direction == "UP"))))
DEG_DOWN_Count<-c( as.integer(length(which(PD_Control_DEG$DE_direction == "DOWN"))))
Model<-c("PD_Control_PVal_logFC")
df<-data.frame( Model,DEG_UP_Count, DEG_DOWN_Count)
knitr::kable(df, format = "simple", table.attr = "style='width:80%;'",caption = "Basal Monocytes threashold(P) DEG counts", col.names=c('DEG_Limma_Model','DEG_UP_Count','DEG_DOWN_Count'))| DEG_Limma_Model | DEG_UP_Count | DEG_DOWN_Count |
|---|---|---|
| PD_Control_PVal_logFC | 5362 | 5208 |
PD_Control_1_toptable.tsv is a from previous Limma RNA editing
differential analysis with the same model of (disease + sex + age +
batch)
When DEGenes and DESites annoatated with Gencodes were compared to find
any overlapping genes, there was only 1 gene found.
# GENCODE
gencode<-read_tsv("gencode.v30.tx2gene.tsv")%>%
distinct(GENEID, .keep_all=TRUE)%>%
dplyr::select(-c("TXNAME"))
# Taking everything after .
# ".xx" is mismatched in genecode and our annoation file
gencode$GENEID<-gsub("\\..*","",gencode$GENEID)
# ANNOTATION
PD_anno<- read_tsv("Basal/Local_Pipeline_Result/all_sites_pileup_annotation.gz")
PD_anno<- PD_anno %>%
dplyr::select(ESid2,ensembl_id,
location = Func.refGene, mutation =ExonicFunc.refGene)%>%
dplyr::rename("ESid" = "ESid2")%>%
# changing ensemble_ID to gene
dplyr::rename("GENEID" = "ensembl_id")
# LIMMA SITES : 221 DE sites (adj.P.Val<= 0.05)
PD_Control_DE_Sites<-read_tsv("Basal/TopTable/PD_Control_1_toptable.tsv")%>%
dplyr::filter(adj.P.Val<= 0.05)
# Adding DESite toptable + Annotation
PD_Control_DE_Sites_anno<-merge(PD_Control_DE_Sites, PD_anno, by ="ESid")
# Taking every ".xx" out to match with gencode
PD_Control_DE_Sites_anno$GENEID<-gsub("\\..*","",PD_Control_DE_Sites_anno$GENEID)
# Adding DESite toptable + Annoatation + Genecode
PD_Control_DE_Sites_anno_gencode<-merge(PD_Control_DE_Sites_anno,gencode, by = "GENEID")
filepath<-file.path("/Users/hyominseo/Desktop/RAJ/Gene_Differential/Basal/TopTable")
write.table(PD_Control_DE_Sites_anno_gencode, file = file.path(filepath,"PD_Control_DE_Sites.tsv"),
row.names = F, sep = "\t", quote = F)PD_Control_DEG<-read_tsv("Basal/TopTable/PD_Control_DEG.tsv")
# Taking every ".xx" out to match with gencode+anno+DES matrix
PD_Control_DEG$GENEID<-gsub("\\..*","",PD_Control_DEG$GENEID)
PD_Control_DE_Sites<-read_tsv("Basal/TopTable/PD_Control_DE_Sites.tsv")
number<-length(intersect(PD_Control_DEG$GENEID, PD_Control_DE_Sites$GENEID))
gene<-intersect(PD_Control_DEG$GENEID, PD_Control_DE_Sites$GENEID)
string_1<-"Number of Gene that is present in both Basal Monocyte DEG(P/logFC) and DES:"
print(paste0(string_1,number))## [1] "Number of Gene that is present in both Basal Monocyte DEG(P/logFC) and DES:1"
Now, we are removing the logFC threshold when determining the Differential Expressed genes, keeping only the adj.P value > 0.05 threshold.
PD_Control_P_Gene<-read_tsv("Basal/TopTable/PD_Control.tsv")
PD_Control_P_Gene$DE_genes<-case_when(
# defining sig DEGs
PD_Control_P_Gene$adj.P.Val < 0.05 ~ "TRUE",
PD_Control_P_Gene$adj.P.Val > 0.05 ~ "FALSE"
)
DEgenes_count<-as.integer(length(which(PD_Control_P_Gene$DE_genes == "TRUE")))
string<-"## Number of Differentially Expressed Genes (adj.P.Val < 0.05 ) are: "
print(paste(string, DEgenes_count))
# Adding UP and Down regulated gene count
PD_Control_P_Gene$DE_direction<-case_when(
PD_Control_P_Gene$DE_genes == "TRUE" & PD_Control_P_Gene$logFC > 0.00 ~ "UP",
PD_Control_P_Gene$DE_genes == "TRUE" & PD_Control_P_Gene$logFC < 0.00 ~ "DOWN"
)
#filepath<-file.path("/Users/hyominseo/Desktop/RAJ/Gene_Differential/Basal/TopTable")
#write.table(PD_Control_P_Gene, file = file.path(filepath,"PD_Control_P_Gene.tsv"), row.names = F, sep = "\t", quote = F)PD_Control_P_Gene<- read_tsv("Basal/TopTable/PD_Control_P_Gene.tsv")
DEG_UP_Count<-c( as.integer(length(which(PD_Control_P_Gene$DE_direction == "UP"))))
DEG_DOWN_Count<-c( as.integer(length(which(PD_Control_P_Gene$DE_direction == "DOWN"))))
Model<-c("PD_Control_P_Val")
df<-data.frame( Model, DEG_UP_Count, DEG_DOWN_Count)
knitr::kable(df, format = "simple", table.attr = "style='width:80%;'",caption = "Basal Monocytes only P val threashold DEG counts", col.names=c('DEG_Limma_Model','DEG_UP_Count','DEG_DOWN_Count'))| DEG_Limma_Model | DEG_UP_Count | DEG_DOWN_Count |
|---|---|---|
| PD_Control_P_Val | 5362 | 5208 |
There are many APOBEC gene family that are found, which previosuly were considered not DE since they had logFC less than 1.
PD_Control_P_Gene<-read_tsv("Basal/TopTable/PD_Control_P_Gene.tsv")
PD_Control_P_Gene_a<-PD_Control_P_Gene%>%dplyr::filter(grepl("ADAR.*|APOBEC.*", GENENAME))
PD_Control_P_Gene_a<-PD_Control_P_Gene_a%>%dplyr::select(-c(AveExpr, t, B,DE_genes))%>%arrange(desc(logFC))
knitr::kable(PD_Control_P_Gene_a, "simple", caption = "A~ genes in PD_Control_P_Gene", table.attr = "style='width:100%;'")| GENEID | logFC | P.Value | adj.P.Val | GENENAME | DE_direction |
|---|---|---|---|---|---|
| ENSG00000179750.16 | 0.8436156 | 0.0000079 | 0.0000171 | APOBEC3B | UP |
| ENSG00000244509.4 | 0.6488530 | 0.0000000 | 0.0000000 | APOBEC3C | UP |
| ENSG00000128394.17 | 0.3646500 | 0.0000206 | 0.0000426 | APOBEC3F | UP |
| ENSG00000197381.16 | 0.1379390 | 0.0014782 | 0.0024489 | ADARB1 | UP |
| ENSG00000160710.16 | 0.0932445 | 0.0251830 | 0.0351752 | ADAR | UP |
| ENSG00000128383.13 | -0.2746187 | 0.0000091 | 0.0000195 | APOBEC3A | DOWN |
| ENSG00000239713.9 | -0.3238005 | 0.0000000 | 0.0000000 | APOBEC3G | DOWN |
| ENSG00000243811.10 | -0.3430040 | 0.0000006 | 0.0000014 | APOBEC3D | DOWN |
Gprofiler: https://www.proquest.com/docview/2597928351?parentSessionId=yru5MIKlo8EUo5jpqSL%2BNzkuC9dAD7YRoRLusR%2BvNV4%3D /
https://cran.r-project.org/web/packages/gprofiler2/vignettes/gprofiler2.html /
Again, a gene is considered significantly differentially expressed with the threshold of : (adj.P.Val<= 0.05 and logFC)
Pathways of DEGenes and DESites (annotated by genename,and with relazed threshold) are analysed. Gprofiler uses custom domain, which is the vector of all gene names found in samples ( the subjects to the DEG analysis)
all_gene_name<-read_tsv("Basal/Basal_tsv/PD_Control_All_Gene_Names.tsv")
all_gene_name<-all_gene_name[['GENENAME']]
gprofiler<-function(file_name){
df<-read_tsv(file_name)
df_up<-subset(df, logFC >0)
df_down<-subset(df, logFC <0)
gp_up<-gost(row.names(df_up),organism = "hsapiens",
domain_scope = "custom",
custom_bg=all_gene_name)
gp_down<-gost(row.names(df_down),organism ="hsapiens",
domain_scope = "custom",
custom_bg=all_gene_name)
string_1<-"Number of UP regulated genes:"
num_1<-as.integer(nrow(df_up))
string_2<-"Number of DOWN regulated genes:"
num_2<-as.integer(nrow(df_down))
print(paste(string_1,num_1,string_2,num_2))
multi_gp<- gost(list("up-regulated" = row.names(df_up),
"down-regulated" = row.names(df_down)))
multi_gp
}
process_path<-function(path){
path<-path%>%dplyr::filter(!grepl("GO:CC|GO:MF|CORUM|HPA|REAC|WP|HP|TF", path$source))
path$log_P_value<-(-log(path$p_value))
path<-path%>%dplyr::select(c(query, p_value,term_id,term_name, source,log_P_value))
path
}## [1] "Number of UP regulated genes: 417 Number of DOWN regulated genes: 212"
| Direction_Source | Count | Mean |
|---|---|---|
| down_go | 388 | 10.586654 |
| down_kegg | 63 | 10.357275 |
| up_go | 657 | 12.329719 |
| up_kegg | 58 | 9.466644 |
Pathways of DESites found in limma model with PD vs Control contrast.
## [1] "Number of UP regulated genes: 179 Number of DOWN regulated genes: 41"
| Direction_Source | Count | Mean |
|---|---|---|
| down_go | 46 | 9.305066 |
| down_kegg | 11 | 11.649200 |
| up_go | 297 | 9.991215 |
| up_kegg | 61 | 9.508614 |
text_size =12
title_text_size = 12
point_size=2
legend_size = 0.6
legend_text_size = 8
label_size = 2
inplot_text_size=4
pd_vol_plot<-vol_plot("Basal/TopTable/PD_Control_P_Gene.tsv",point_size, text_size, title_text_size, "Differentially Expressed Genes",label_size)
pd_path_pval_plot<-path_pval_plot("Basal/TopTable/PD_DEG_DES_path_match.tsv","Shared Pathways",point_size,text_size,title_text_size)
pd_1<-ggarrange(pd_vol_plot, pd_path_pval_plot,
labels=c("C","D"),
font.label=list(color="black",size= text_size),
ncol=2)
pd_mut_logfc_plot<-mut_logfc_plot("Basal/TopTable/PD_DEG_DES_match.tsv",point_size,text_size, "Mutation", legend_size,label_size)
pd_loc_logfc_plot<-loc_logfc_plot("Basal/TopTable/PD_DEG_DES_match.tsv",point_size,text_size, "Location", legend_size,label_size,inplot_text_size)
pd_2<-ggarrange(pd_mut_logfc_plot,pd_loc_logfc_plot,
labels=c("E","F"),
font.label=list(color="black",size= text_size),
ncol=2)
tpm_rate_plot<-
ggplot(ed_1_merge,aes(x=Expression_TPM, y=Editing_Rate))+
theme_bw() +
geom_jitter(aes(color=Disease),alpha=0.8, size = 3)+
scale_color_manual(values = wes_palette(n=4,name="GrandBudapest1"))+
stat_cor(size = inplot_text_size,
label.x =0,
label.y=0.8)+
labs(x = "log(Expression TPM)", y = "RNA Editing Rate")+
theme(axis.text.x=element_text(size=text_size),
axis.text.y=element_text(size=text_size))+
theme(text = element_text(size =text_size))+
labs(title="MTRNR2L8 Noncoding TPM vs Editing Rate")+
theme(plot.title = element_text(size=10, face="bold", hjust=0.5))+
theme(legend.key.size = unit(legend_size,'cm'),
legend.title = element_text(color = "black", size =legend_text_size),
legend.text = element_text(color = "black", size = legend_text_size))+
guides(color = guide_legend(override.aes = list(size = 7)))
tpm_rate_plot<-ggarrange(tpm_rate_plot,
labels =c("G"),
font.label=list(color="black",size= text_size))
tpm_rate_plot<-annotate_figure(tpm_rate_plot,
top=text_grob("DES and DEG Shared Gene",
color = "black", face = "bold", size =title_text_size))
all_plot<-ggarrange(pd_1,pd_2,tpm_rate_plot, nrow=3)
ggsave(plot=all_plot,filename="Basal/Figures/figure4:basal_all_plot.jpg",width = 10, height = 8,dpi = 600)Basal All plots
PD_Control_DE_Sites<-read_tsv("Basal/TopTable/PD_Control_1_toptable.tsv")%>%
dplyr::filter(adj.P.Val <= 0.05)
PD_Control_DE_Sites_Gene<-read_tsv("Basal/TopTable/PD_Control_DE_Sites.tsv")
PD_Control_DEG<-read_tsv("Basal/TopTable/PD_Control_DEG.tsv")
PD_Control_P_Gene<-read_tsv("Basal/TopTable/PD_Control_P_Gene.tsv")
Category<-c("DE_Sites, Site count", "DE_Sites_Gencode_Annotated, Gene count", "DE_Genes, Gene count","P_val_Genes, Gene count")
Threshold<-c("adj.P < 0.05","adj.P < 0.05","adj.P.Val < 0.05 & abs(logFC) > 1","adj.P.Val < 0.05")
Count<-c(as.integer(nrow(PD_Control_DE_Sites)), as.integer(nrow(PD_Control_DE_Sites_Gene)),
as.integer(nrow(PD_Control_DEG)),as.integer(nrow(PD_Control_P_Gene)))
Unique_Gene<- c("All Sites Unique",as.integer(length(unique(PD_Control_DE_Sites_Gene$GENENAME))),
as.integer(length(unique(PD_Control_DEG$GENENAME))),
as.integer(length(unique(PD_Control_P_Gene$GENENAME))))
df<-data.frame(Threshold, Category, Count, Unique_Gene)
knitr::kable(df, "simple", caption = "All Count", table.attr = "style='width:100%;'")vol_plot<-function(t, point_size, text_size,title_text_size,plot_title,label_size){
vol_plot<- ggplot(t, aes(x=logFC, y=-log10(P.Value), col = DE_direction))+
geom_point(size =point_size, alpha=0.7)+
theme_bw()+
scale_color_manual(values = wes_palette(n=5,name="Rushmore1"))+
geom_vline(xintercept=c(-1,1), col ="black",linetype="longdash")+
labs(title=plot_title)+
theme(plot.title = element_text(size=title_text_size, face="bold", hjust=0.5))+
theme(text = element_text(size = text_size)) +
theme(legend.position='none')
vol_plot
}
process_deg<-function(table_in){
t<-read_tsv(table_in)
t$DE_genes<-case_when(
t$adj.P.Val < 0.05 & abs(t$logFC) > 1 ~ "TRUE",
t$adj.P.Val > 0.05 & abs(t$logFC) < 1 ~ "FALSE"
)
# Adding UP and Down regulated gene count
t$DE_direction<-case_when(
t$DE_genes == "TRUE" & t$logFC > 0.00 ~ "UP",
t$DE_genes == "TRUE" & t$logFC < 0.00 ~ "DOWN"
)
t
}pd_risk<-process_deg("Basal/TopTable/PD_Control.tsv")
highlight_df<-pd_risk%>%dplyr::filter(grepl("LRRK2|PLCG2|PILRA",GENENAME))
pd_risk_plot<-vol_plot(pd_risk,point_size, text_size, title_text_size, "PD vs Control ",label_size)+
geom_label_repel(data = highlight_df, aes(label = GENENAME, col = GENENAME),
box.padding = 0, max.overlaps = 3,
point.padding = 0, force = 80,
segment.size = 0.2,
point.size =point_size,
fontface="bold",
nudge_x = -0.1,hjust =0,
size = label_size)+
geom_point(data = highlight_df, aes(label = GENENAME, col = GENENAME),size=3)+
theme(legend.position = "none")
ggsave(plot=pd_risk_plot,filename="Basal/Figures/figure5:basal_risk_plot.jpg",width = 10, height = 4,dpi = 600)